{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Entities Recognition"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-info\">\n",
    "\n",
    "This tutorial is available as an IPython notebook at [Malaya/example/entities](https://github.com/huseinzol05/Malaya/tree/master/example/entities).\n",
    "    \n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-warning\">\n",
    "\n",
    "This module only trained on standard language structure, so it is not save to use it for local language structure.\n",
    "    \n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "\n",
    "logging.basicConfig(level=logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmp0npqw77q\n",
      "INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmp0npqw77q/_remote_module_non_scriptable.py\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.89 s, sys: 3.49 s, total: 6.38 s\n",
      "Wall time: 2.3 s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3397\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
      "/home/husein/dev/malaya/malaya/tokenizer.py:214: FutureWarning: Possible nested set at position 3927\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "import malaya"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Describe supported entities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'Tag': 'OTHER', 'Description': 'other'},\n",
       " {'Tag': 'law',\n",
       "  'Description': 'law, regulation, related law documents, documents, etc'},\n",
       " {'Tag': 'location', 'Description': 'location, place'},\n",
       " {'Tag': 'organization',\n",
       "  'Description': 'organization, company, government, facilities, etc'},\n",
       " {'Tag': 'person',\n",
       "  'Description': 'person, group of people, believes, unique arts (eg; food, drink), etc'},\n",
       " {'Tag': 'quantity', 'Description': 'numbers, quantity'},\n",
       " {'Tag': 'time', 'Description': 'date, day, time, etc'},\n",
       " {'Tag': 'event', 'Description': 'unique event happened, etc'}]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "malaya.entity.describe"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### List available HuggingFace NER models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'mesolitica/ner-t5-tiny-standard-bahasa-cased': {'Size (MB)': 84.7,\n",
       "  'law': {'precision': 0.9642625081221572,\n",
       "   'recall': 0.9598965071151359,\n",
       "   'f1': 0.9620745542949757,\n",
       "   'number': 1546},\n",
       "  'person': {'precision': 0.9673319980661648,\n",
       "   'recall': 0.971424608128728,\n",
       "   'f1': 0.9693739834584906,\n",
       "   'number': 14418},\n",
       "  'time': {'precision': 0.9796992481203007,\n",
       "   'recall': 0.983148893360161,\n",
       "   'f1': 0.9814210394175245,\n",
       "   'number': 3976},\n",
       "  'location': {'precision': 0.966455899689208,\n",
       "   'recall': 0.9753406878650227,\n",
       "   'f1': 0.970877967379017,\n",
       "   'number': 9246},\n",
       "  'organization': {'precision': 0.9308265342319971,\n",
       "   'recall': 0.9475204622051036,\n",
       "   'f1': 0.9390993140471219,\n",
       "   'number': 8308},\n",
       "  'quantity': {'precision': 0.9824689554419284,\n",
       "   'recall': 0.9853479853479854,\n",
       "   'f1': 0.9839063643013899,\n",
       "   'number': 2730},\n",
       "  'event': {'precision': 0.8535980148883374,\n",
       "   'recall': 0.8973913043478261,\n",
       "   'f1': 0.8749470114455278,\n",
       "   'number': 1150},\n",
       "  'overall_precision': 0.9585080133195985,\n",
       "  'overall_recall': 0.9670566055977183,\n",
       "  'overall_f1': 0.9627633336140621,\n",
       "  'overall_accuracy': 0.9951433495221682},\n",
       " 'mesolitica/ner-t5-small-standard-bahasa-cased': {'Size (MB)': 141,\n",
       "  'law': {'precision': 0.9320327249842668,\n",
       "   'recall': 0.9579560155239327,\n",
       "   'f1': 0.9448165869218501,\n",
       "   'number': 1546},\n",
       "  'person': {'precision': 0.9745341614906833,\n",
       "   'recall': 0.9794007490636704,\n",
       "   'f1': 0.976961394769614,\n",
       "   'number': 14418},\n",
       "  'time': {'precision': 0.9583539910758553,\n",
       "   'recall': 0.9723340040241448,\n",
       "   'f1': 0.9652933832709114,\n",
       "   'number': 3976},\n",
       "  'location': {'precision': 0.9709677419354839,\n",
       "   'recall': 0.9766385463984426,\n",
       "   'f1': 0.9737948883856357,\n",
       "   'number': 9246},\n",
       "  'organization': {'precision': 0.9493625210488333,\n",
       "   'recall': 0.9500481463649495,\n",
       "   'f1': 0.9497052099627,\n",
       "   'number': 8308},\n",
       "  'quantity': {'precision': 0.9823008849557522,\n",
       "   'recall': 0.9758241758241758,\n",
       "   'f1': 0.9790518191841234,\n",
       "   'number': 2730},\n",
       "  'event': {'precision': 0.8669991687448046,\n",
       "   'recall': 0.9069565217391304,\n",
       "   'f1': 0.88652783680408,\n",
       "   'number': 1150},\n",
       "  'overall_precision': 0.9629220498535133,\n",
       "  'overall_recall': 0.9691593754531832,\n",
       "  'overall_f1': 0.9660306446949986,\n",
       "  'overall_accuracy': 0.9953954840983863}}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "malaya.entity.available_huggingface"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "string = 'KUALA LUMPUR: Sempena sambutan Aidilfitri minggu depan, Perdana Menteri Tun Dr Mahathir Mohamad dan Menteri Pengangkutan Anthony Loke Siew Fook menitipkan pesanan khas kepada orang ramai yang mahu pulang ke kampung halaman masing-masing. Dalam video pendek terbitan Jabatan Keselamatan Jalan Raya (JKJR) itu, Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar  sekiranya mengantuk ketika memandu.'\n",
    "string1 = 'memperkenalkan Husein, dia sangat comel, berumur 25 tahun, bangsa melayu, agama islam, tinggal di cyberjaya malaysia, bercakap bahasa melayu, semua membaca buku undang-undang kewangan, dengar laju Siti Nurhaliza - Seluruh Cinta sambil makan ayam goreng KFC'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load HuggingFace model\n",
    "\n",
    "```python\n",
    "def huggingface(\n",
    "    model: str = 'mesolitica/ner-t5-small-standard-bahasa-cased',\n",
    "    force_check: bool = True,\n",
    "    **kwargs,\n",
    "):\n",
    "    \"\"\"\n",
    "    Load HuggingFace model to Entity Recognition.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    model: str, optional (default='mesolitica/ner-t5-small-standard-bahasa-cased')\n",
    "        Check available models at `malaya.entity.available_huggingface`.\n",
    "    force_check: bool, optional (default=True)\n",
    "        Force check model one of malaya model.\n",
    "        Set to False if you have your own huggingface model.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result: malaya.torch_model.huggingface.Tagging\n",
    "    \"\"\"\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d4d4a60292184f869008b4924ed4757c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)okenizer_config.json:   0%|          | 0.00/21.2k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f7fe54baf2c649398992679a0ceb1cff",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading spiece.model:   0%|          | 0.00/803k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3a4b97c67c7447789f51ce991b5bf864",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.44M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "156d9b5af2104136acbf49c156c54b17",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)in/added_tokens.json:   0%|          | 0.00/47.0 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f31341bf3e894dae84f5800453111e96",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.23k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e115e1c70a974db0a559469f4c927371",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)lve/main/config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cf089f56611d4778b8d0b277e10779db",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading model.safetensors:   0%|          | 0.00/141M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "model = malaya.entity.huggingface()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "_ = model.eval()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Predict\n",
    "\n",
    "```python\n",
    "def predict(self, string: str):\n",
    "    \"\"\"\n",
    "    Tag a string.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    string : str\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result: Tuple[str, str]\n",
    "    \"\"\"\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('KUALA', 'location'),\n",
       " ('LUMPUR:', 'location'),\n",
       " ('Sempena', 'OTHER'),\n",
       " ('sambutan', 'OTHER'),\n",
       " ('Aidilfitri', 'event'),\n",
       " ('minggu', 'OTHER'),\n",
       " ('depan,', 'OTHER'),\n",
       " ('Perdana', 'person'),\n",
       " ('Menteri', 'person'),\n",
       " ('Tun', 'person'),\n",
       " ('Dr', 'person'),\n",
       " ('Mahathir', 'person'),\n",
       " ('Mohamad', 'person'),\n",
       " ('dan', 'OTHER'),\n",
       " ('Menteri', 'person'),\n",
       " ('Pengangkutan', 'person'),\n",
       " ('Anthony', 'person'),\n",
       " ('Loke', 'person'),\n",
       " ('Siew', 'person'),\n",
       " ('Fook', 'person'),\n",
       " ('menitipkan', 'OTHER'),\n",
       " ('pesanan', 'OTHER'),\n",
       " ('khas', 'OTHER'),\n",
       " ('kepada', 'OTHER'),\n",
       " ('orang', 'OTHER'),\n",
       " ('ramai', 'OTHER'),\n",
       " ('yang', 'OTHER'),\n",
       " ('mahu', 'OTHER'),\n",
       " ('pulang', 'OTHER'),\n",
       " ('ke', 'OTHER'),\n",
       " ('kampung', 'OTHER'),\n",
       " ('halaman', 'OTHER'),\n",
       " ('masing-masing.', 'OTHER'),\n",
       " ('Dalam', 'OTHER'),\n",
       " ('video', 'OTHER'),\n",
       " ('pendek', 'OTHER'),\n",
       " ('terbitan', 'OTHER'),\n",
       " ('Jabatan', 'organization'),\n",
       " ('Keselamatan', 'organization'),\n",
       " ('Jalan', 'organization'),\n",
       " ('Raya', 'organization'),\n",
       " ('(JKJR)', 'organization'),\n",
       " ('itu,', 'OTHER'),\n",
       " ('Dr', 'person'),\n",
       " ('Mahathir', 'person'),\n",
       " ('menasihati', 'OTHER'),\n",
       " ('mereka', 'OTHER'),\n",
       " ('supaya', 'OTHER'),\n",
       " ('berhenti', 'OTHER'),\n",
       " ('berehat', 'OTHER'),\n",
       " ('dan', 'OTHER'),\n",
       " ('tidur', 'OTHER'),\n",
       " ('sebentar', 'OTHER'),\n",
       " ('sekiranya', 'OTHER'),\n",
       " ('mengantuk', 'OTHER'),\n",
       " ('ketika', 'OTHER'),\n",
       " ('memandu.', 'OTHER')]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.predict(string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('memperkenalkan', 'OTHER'),\n",
       " ('Husein,', 'person'),\n",
       " ('dia', 'OTHER'),\n",
       " ('sangat', 'OTHER'),\n",
       " ('comel,', 'OTHER'),\n",
       " ('berumur', 'OTHER'),\n",
       " ('25', 'OTHER'),\n",
       " ('tahun,', 'OTHER'),\n",
       " ('bangsa', 'OTHER'),\n",
       " ('melayu,', 'person'),\n",
       " ('agama', 'person'),\n",
       " ('islam,', 'person'),\n",
       " ('tinggal', 'OTHER'),\n",
       " ('di', 'OTHER'),\n",
       " ('cyberjaya', 'location'),\n",
       " ('malaysia,', 'OTHER'),\n",
       " ('bercakap', 'OTHER'),\n",
       " ('bahasa', 'OTHER'),\n",
       " ('melayu,', 'OTHER'),\n",
       " ('semua', 'OTHER'),\n",
       " ('membaca', 'OTHER'),\n",
       " ('buku', 'OTHER'),\n",
       " ('undang-undang', 'law'),\n",
       " ('kewangan,', 'event'),\n",
       " ('dengar', 'OTHER'),\n",
       " ('laju', 'OTHER'),\n",
       " ('Siti', 'person'),\n",
       " ('Nurhaliza', 'person'),\n",
       " ('-', 'person'),\n",
       " ('Seluruh', 'person'),\n",
       " ('Cinta', 'event'),\n",
       " ('sambil', 'OTHER'),\n",
       " ('makan', 'OTHER'),\n",
       " ('ayam', 'OTHER'),\n",
       " ('goreng', 'OTHER'),\n",
       " ('KFC', 'person')]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.predict(string1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Group similar tags\n",
    "\n",
    "```python\n",
    "def analyze(self, string: str):\n",
    "        \"\"\"\n",
    "        Analyze a string.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        string : str\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        result: {'words': List[str], 'tags': [{'text': 'text', 'type': 'location', 'score': 1.0, 'beginOffset': 0, 'endOffset': 1}]}\n",
    "        \"\"\"\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'text': ['KUALA', 'LUMPUR:'],\n",
       "  'type': 'location',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 0,\n",
       "  'endOffset': 2},\n",
       " {'text': ['Sempena', 'sambutan'],\n",
       "  'type': 'OTHER',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 2,\n",
       "  'endOffset': 4},\n",
       " {'text': ['Aidilfitri'],\n",
       "  'type': 'event',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 4,\n",
       "  'endOffset': 5},\n",
       " {'text': ['minggu', 'depan,'],\n",
       "  'type': 'OTHER',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 5,\n",
       "  'endOffset': 7},\n",
       " {'text': ['Perdana', 'Menteri', 'Tun', 'Dr', 'Mahathir', 'Mohamad'],\n",
       "  'type': 'person',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 7,\n",
       "  'endOffset': 13},\n",
       " {'text': ['dan'],\n",
       "  'type': 'OTHER',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 13,\n",
       "  'endOffset': 14},\n",
       " {'text': ['Menteri', 'Pengangkutan', 'Anthony', 'Loke', 'Siew', 'Fook'],\n",
       "  'type': 'person',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 14,\n",
       "  'endOffset': 20},\n",
       " {'text': ['menitipkan',\n",
       "   'pesanan',\n",
       "   'khas',\n",
       "   'kepada',\n",
       "   'orang',\n",
       "   'ramai',\n",
       "   'yang',\n",
       "   'mahu',\n",
       "   'pulang',\n",
       "   'ke',\n",
       "   'kampung',\n",
       "   'halaman',\n",
       "   'masing-masing.',\n",
       "   'Dalam',\n",
       "   'video',\n",
       "   'pendek',\n",
       "   'terbitan'],\n",
       "  'type': 'OTHER',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 20,\n",
       "  'endOffset': 37},\n",
       " {'text': ['Jabatan', 'Keselamatan', 'Jalan', 'Raya', '(JKJR)'],\n",
       "  'type': 'organization',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 37,\n",
       "  'endOffset': 42},\n",
       " {'text': ['itu,'],\n",
       "  'type': 'OTHER',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 42,\n",
       "  'endOffset': 43},\n",
       " {'text': ['Dr', 'Mahathir'],\n",
       "  'type': 'person',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 43,\n",
       "  'endOffset': 45},\n",
       " {'text': ['menasihati',\n",
       "   'mereka',\n",
       "   'supaya',\n",
       "   'berhenti',\n",
       "   'berehat',\n",
       "   'dan',\n",
       "   'tidur',\n",
       "   'sebentar',\n",
       "   'sekiranya',\n",
       "   'mengantuk',\n",
       "   'ketika',\n",
       "   'memandu.'],\n",
       "  'type': 'OTHER',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 45,\n",
       "  'endOffset': 57}]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.analyze(string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'text': ['memperkenalkan'],\n",
       "  'type': 'OTHER',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 0,\n",
       "  'endOffset': 1},\n",
       " {'text': ['Husein,'],\n",
       "  'type': 'person',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 1,\n",
       "  'endOffset': 2},\n",
       " {'text': ['dia', 'sangat', 'comel,', 'berumur', '25', 'tahun,', 'bangsa'],\n",
       "  'type': 'OTHER',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 2,\n",
       "  'endOffset': 9},\n",
       " {'text': ['melayu,', 'agama', 'islam,'],\n",
       "  'type': 'person',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 9,\n",
       "  'endOffset': 12},\n",
       " {'text': ['tinggal', 'di'],\n",
       "  'type': 'OTHER',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 12,\n",
       "  'endOffset': 14},\n",
       " {'text': ['cyberjaya'],\n",
       "  'type': 'location',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 14,\n",
       "  'endOffset': 15},\n",
       " {'text': ['malaysia,',\n",
       "   'bercakap',\n",
       "   'bahasa',\n",
       "   'melayu,',\n",
       "   'semua',\n",
       "   'membaca',\n",
       "   'buku'],\n",
       "  'type': 'OTHER',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 15,\n",
       "  'endOffset': 22},\n",
       " {'text': ['undang-undang'],\n",
       "  'type': 'law',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 22,\n",
       "  'endOffset': 23},\n",
       " {'text': ['kewangan,'],\n",
       "  'type': 'event',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 23,\n",
       "  'endOffset': 24},\n",
       " {'text': ['dengar', 'laju'],\n",
       "  'type': 'OTHER',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 24,\n",
       "  'endOffset': 26},\n",
       " {'text': ['Siti', 'Nurhaliza', '-', 'Seluruh'],\n",
       "  'type': 'person',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 26,\n",
       "  'endOffset': 30},\n",
       " {'text': ['Cinta'],\n",
       "  'type': 'event',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 30,\n",
       "  'endOffset': 31},\n",
       " {'text': ['sambil', 'makan', 'ayam', 'goreng'],\n",
       "  'type': 'OTHER',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 31,\n",
       "  'endOffset': 35},\n",
       " {'text': ['KFC'],\n",
       "  'type': 'person',\n",
       "  'score': 1.0,\n",
       "  'beginOffset': 35,\n",
       "  'endOffset': 36}]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.analyze(string1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load general Malaya entity model\n",
    "\n",
    "This model able to classify,\n",
    "\n",
    "1. date\n",
    "2. money\n",
    "3. temperature\n",
    "4. distance\n",
    "5. volume\n",
    "6. duration\n",
    "7. phone\n",
    "8. email\n",
    "9. url\n",
    "10. time\n",
    "11. datetime\n",
    "12. local and generic foods, can check available rules in malaya.texts._food\n",
    "13. local and generic drinks, can check available rules in malaya.texts._food\n",
    "\n",
    "We can insert BERT or any deep learning model by passing `malaya.entity.general_entity(model = model)`, as long the model has `predict` method and return `[(string, label), (string, label)]`. This is an optional."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "entity = malaya.entity.general_entity(model = model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'person': ['Husein', 'milo o ais'],\n",
       " 'OTHER': ['baca buku',\n",
       "  'yang berharga',\n",
       "  'dekat',\n",
       "  ', suhu',\n",
       "  'sambil makan ayam goreng dan'],\n",
       " 'law': ['Perlembagaan'],\n",
       " 'quantity': ['3k ringgit', '32 celcius,'],\n",
       " 'location': ['kfc sungai petani'],\n",
       " 'time': {'minggu lalu, 2 PM 2 oktober 2019': None,\n",
       "  '2 PM': datetime.datetime(2023, 10, 12, 14, 0)},\n",
       " 'date': {'2 oktober 2019': datetime.datetime(2019, 10, 2, 0, 0),\n",
       "  'minggu lalu': datetime.datetime(2023, 10, 5, 15, 37, 26, 457099)},\n",
       " 'money': {'3k ringgit': 'RM3000.0'},\n",
       " 'temperature': ['32 celcius'],\n",
       " 'distance': [],\n",
       " 'volume': [],\n",
       " 'duration': [],\n",
       " 'phone': [],\n",
       " 'email': [],\n",
       " 'url': [],\n",
       " 'datetime': {'2 ptg 2 oktober 2019': datetime.datetime(2019, 10, 2, 14, 0)},\n",
       " 'food': ['ayam goreng'],\n",
       " 'drink': ['milo o ais'],\n",
       " 'weight': []}"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "entity.predict('Husein baca buku Perlembagaan yang berharga 3k ringgit dekat kfc sungai petani minggu lepas, 2 ptg 2 oktober 2019 , suhu 32 celcius, sambil makan ayam goreng dan milo o ais')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'OTHER': ['contact', 'at'],\n",
       " 'person': ['Husein'],\n",
       " 'organization': ['husein.zol05@gmail.com'],\n",
       " 'date': {},\n",
       " 'money': {},\n",
       " 'temperature': [],\n",
       " 'distance': [],\n",
       " 'volume': [],\n",
       " 'duration': [],\n",
       " 'phone': [],\n",
       " 'email': ['husein.zol05@gmail.com'],\n",
       " 'url': [],\n",
       " 'time': {},\n",
       " 'datetime': {},\n",
       " 'food': [],\n",
       " 'drink': [],\n",
       " 'weight': []}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "entity.predict('contact Husein at husein.zol05@gmail.com')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'OTHER': ['tolong tempahkan meja makan makan nasi dagang dan',\n",
       "  'tarik esok dekat'],\n",
       " 'person': ['jus'],\n",
       " 'event': ['apple, milo'],\n",
       " 'location': ['Restoran Sebulek'],\n",
       " 'date': {'esok': datetime.datetime(2023, 10, 13, 15, 37, 33, 353440)},\n",
       " 'money': {},\n",
       " 'temperature': [],\n",
       " 'distance': [],\n",
       " 'volume': [],\n",
       " 'duration': [],\n",
       " 'phone': [],\n",
       " 'email': [],\n",
       " 'url': [],\n",
       " 'time': {},\n",
       " 'datetime': {},\n",
       " 'food': ['nasi dagang'],\n",
       " 'drink': ['milo tarik', 'jus apple'],\n",
       " 'weight': []}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "entity.predict('tolong tempahkan meja makan makan nasi dagang dan jus apple, milo tarik esok dekat Restoran Sebulek')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Voting stack model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "tiny = malaya.entity.huggingface(model = 'mesolitica/ner-t5-tiny-standard-bahasa-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('memperkenalkan', 'OTHER'),\n",
       " ('Husein,', 'person'),\n",
       " ('dia', 'OTHER'),\n",
       " ('sangat', 'OTHER'),\n",
       " ('comel,', 'OTHER'),\n",
       " ('berumur', 'OTHER'),\n",
       " ('25', 'OTHER'),\n",
       " ('tahun,', 'OTHER'),\n",
       " ('bangsa', 'OTHER'),\n",
       " ('melayu,', 'person'),\n",
       " ('agama', 'person'),\n",
       " ('islam,', 'person'),\n",
       " ('tinggal', 'OTHER'),\n",
       " ('di', 'OTHER'),\n",
       " ('cyberjaya', 'location'),\n",
       " ('malaysia,', 'OTHER'),\n",
       " ('bercakap', 'OTHER'),\n",
       " ('bahasa', 'OTHER'),\n",
       " ('melayu,', 'OTHER'),\n",
       " ('semua', 'OTHER'),\n",
       " ('membaca', 'OTHER'),\n",
       " ('buku', 'OTHER'),\n",
       " ('undang-undang', 'law'),\n",
       " ('kewangan,', 'event'),\n",
       " ('dengar', 'OTHER'),\n",
       " ('laju', 'OTHER'),\n",
       " ('Siti', 'person'),\n",
       " ('Nurhaliza', 'person'),\n",
       " ('-', 'person'),\n",
       " ('Seluruh', 'person'),\n",
       " ('Cinta', 'event'),\n",
       " ('sambil', 'OTHER'),\n",
       " ('makan', 'OTHER'),\n",
       " ('ayam', 'OTHER'),\n",
       " ('goreng', 'OTHER'),\n",
       " ('KFC', 'person')]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "malaya.stack.voting_stack([tiny, model, model], string1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
